#' @title GlowwormScale
#' @description Extract data from Seurat file or Counts Matrix, run normalization and background correction. Perform iterative comparison of target gene list compared to randomly selected backround gene list.
#' @param Input Seurat file or counts matrix (as dataframe, matrix or sparse matrix) with genes as row names and cell barcodes as column names
#' @param MetaData A dataframe of metadata with barcodes as row names. Default pulls from the Seurat metadata slot
#' @param ClassName A string containing the column name from the metadata corresponding to the cell class annotation (i.e. Blood)
#' @param SubclassName A string containing the column name from the metadata corresponding to the cell subclass annotation (i.e. CD4+ T Cell). Default is NA.
#' @param Genes A vector of genes you wish to analyze
#' @param GlowwormGenesOutput Results from running GlowwormGenesOutput, including the slot of either 'UniqueGenesinWindow' or 'NearestNeighborGene'
#' @param Verbose Boolean to print progress. Default is TRUE.
#' @param SecondaryGenes The number of secondary gebes to be used in each background list. Default is equal to the length of the input gene list. [[A previously run GlowwormScale object.]]
#' @param BackgroundCorrect Boolean to use background correction. Default is TRUE.
#' @param PercentCutoff To remove background signal, clusters which express a candidate gene in less than a
#' percentage of cells will be reduced to zero. Default is 0.1 (10%)
#' @param seed Seed for selecting the background gene list. Default is 10.
#' @param Iterations Number of background iterations. Default is 5.
#' @param ScaleType Use either 'MinMax' or 'ZScore' scaling. Default is MinMax.
#' @param ConservedGenes If using multiple datasets, input a list of shared protein coding genes across all datasets.
#' @return This function takes a Seurat or Counts matrix, extracts data and merges with metadata.
#' This data is scaled and has background correction \cr \cr Exports a list containing:
#' \itemize{
#' \item MetaData - A copy of the metadata
#' \item GlowwormScaleOutput - A scaled data frame of n genes by n populations in metadata
#' \item RankScores - A data frame indicating the significantly enriched cell populations in the target gene list compared to the background lust
#' \item SumStats - A data frame showing expression level and specificity for each gene.
#' \item Settings - A list of settings
#' }
#' @importFrom matrixStats rowSds
#'
#' @export
GlowwormScale = function(Input, MetaData = Input@meta.data, ClassName, SubclassName = NA, Genes, GlowwormGenesOutput = NA, PercentCutoff = 0.10, Verbose = T, SecondaryGenes = length(Genes), BackgroundCorrect =T, seed = 10, ConservedGenes, ScaleType = "MinMax", Iterations = 5){
#Select background genes
if(any(Genes == names(GlowwormGenesOutput))){GeneList = GlowwormGenesOutput[[Genes]]}else if(is.na(GlowwormGenesOutput)){GeneList = Genes}else{stop("If using GlowwormGenes to determine gene inputs the GlowwormGenesOutput parameter should be set to the output of GlowwormGenes, and the Genes parameter should indicate 'UniqueGenesinWindow' or 'NearestNeighborGene'")
}
#Check candidate genes are non zero - need to be fixed
if(length(unique(GeneList))== 0){stop("No genes in input - check results of GlowwormGenesOutput")}
if(Verbose == T){cat("| 0% 20% 40% 60% 80% 100% |\n| ##")}
#Reorganize Metdata
MetaData <- dplyr::rename(MetaData, Class = all_of(ClassName))
if(is.na(SubclassName)){
MetaDataComb = MetaData %>% dplyr::select(Class)
}else{
MetaData <- dplyr::rename(MetaData, Subclass = all_of(SubclassName))
MetaDataComb = MetaData %>% dplyr::select(Class, Subclass)
}
OutputList = list()
OutputList[["MetaData"]] = MetaDataComb
if(is.na(SubclassName ) == F){
MetaDataComb$Combined = paste(MetaDataComb[["Subclass"]], MetaDataComb[["Class"]], sep="|")
MetaDataComb[["Subclass"]] = NULL
MetaDataComb[["Class"]] = NULL
}else{
MetaDataComb$Combined = paste(MetaDataComb[["Class"]], MetaDataComb[["Class"]], sep="|")
MetaDataComb[["Class"]] = NULL
}
if(Verbose == T){cat("##")}
#Organize Genes
if(class(Input) =="Seurat"){
GetGenes = subset(GeneList, GeneList %in% row.names(Input@assays$RNA@counts))
GenesNotIn = subset(GeneList, ! GeneList %in% row.names(Input@assays$RNA@counts))
RemainingGenes = subset(row.names(Input@assays$RNA@counts), ! row.names(Input@assays$RNA@counts) %in% GeneList)
}else if(class(Input) %in% c("dgCMatrix", "matrix", "data.frame")){
GetGenes = subset(GeneList, GeneList %in% row.names(Input))
GenesNotIn = subset(GeneList, ! GeneList %in% row.names(Input))
RemainingGenes = subset(row.names(Input), ! row.names(Input) %in% GeneList)
}else{stop("Input file needs to be a Seurat File, or a matrix, sparse matrix or data frame with genes as row names and barcodes as column names")}
if(Verbose == T){cat("##")}
Settings = list("InputFile" = as.character(substitute(Input)),
"TargetGenes" = GeneList,
"MissingTargetGenes" = GenesNotIn,
#"SecondaryGenes" = list(),
"N_Classes" = length(unique(OutputList[["MetaData"]]$Class)),
"N_Subclasses" = length(unique(OutputList[["MetaData"]]$Subclass)),
"N_TotalClusters" = length(unique(MetaDataComb$Combined)),
"PercentCutoff" = PercentCutoff)
SecGenesList = list()
CompileSecondary = as.character()
#Select background genes
if(class(SecondaryGenes) == "list"){
SecGenesList = SecondaryGenes
SecGenesNotIn =character()
for(iter in names(SecGenesList)){
MissingSec = subset(SecGenesList[[iter]], ! SecGenesList[[iter]] %in% RemainingGenes)
SecGenesNotIn = unique(c(SecGenesNotIn, MissingSec))
SecGenesList[[iter]] = subset(SecGenesList[[iter]], SecGenesList[[iter]] %in% RemainingGenes)
CompileSecondary = unique(c(CompileSecondary, SecGenesList[[iter]]))
BackgroundType = "selected"
}
}else if(class(SecondaryGenes) %in% c("numeric", "integer") & SecondaryGenes > 0){
BackgroundType = "randomly generated"
for(iter in c(1:Iterations)){
set.seed(seed+iter)
SecGenesList[[paste0("Iteration", iter)]] = sort(sample(RemainingGenes, SecondaryGenes, replace=F))
CompileSecondary = unique(c(CompileSecondary, SecGenesList[[paste0("Iteration", iter)]]))
SecGenesNotIn = NULL
}
Settings[["SecondaryGenes"]] = SecGenesList
Settings[["UniqueSecondaryGenes"]] = CompileSecondary
}else if(length(SecondaryGenes) == 0){stop("SecondaryGenes should contain either the number of randomly selected background genes to use genes, or a string of genes to use as secondary genes. At least an equal number is recommended, but default is set to 100")}
if(Verbose == T){cat("##")}
#Pull data and scale and perform background correction
AllGenes = c(GetGenes, CompileSecondary)
Sections = ceiling(length(AllGenes)/50)
if(Sections == 1){UpdateAt = 1; Sign = "####################"
}else if(Sections %in% c(2,3)){UpdateAt = c(1,2); Sign = "##########"
}else if(Sections == 4){UpdateAt = seq(1,4,1); Sign = "#####"
}else if(Sections %in% c(5,6)){UpdateAt = seq(1,5,1); Sign = "####"
}else if(Sections >= 7 & Sections <= 8){UpdateAt = c(1,3,4,5,7); Sign = "####"
}else if(Sections == 9){UpdateAt = c(1,3,5,7,9); Sign = "####"
}else if(Sections >= 10 & Sections <= 11){UpdateAt = seq(1,10,1); Sign = "##"
}else if(Sections >= 12 & Sections <= 13){UpdateAt = c(1,2,3,5,6,7,8,10,11,12); Sign = "##"
}else if(Sections >= 14 & Sections <= 15){UpdateAt = c(1,2,3,5,7,8,10,11,13,14); Sign = "##"
}else if(Sections >= 16 & Sections <= 17){UpdateAt = c(1,2,3,5,6,8,9,10,12,13,15,16); Sign = "##"
}else if(Sections >= 18 & Sections <= 19){UpdateAt = c(1,2,3,4,6,7,8,10,11,13,14,15,17,18); Sign = "##"
}else if(Sections > 20){UpdateAt = seq(round(Sections/20), Sections, round(Sections/20)); Sign = "#"}
scDataList = list()
StartPt = 1
for(x in 1:Sections){
SectionGenes = na.omit(AllGenes[StartPt:(StartPt+49)])
StartPt = StartPt+50
if(class(Input) =="Seurat"){
scData = Input@assays$RNA@counts[SectionGenes, row.names(MetaData)]
}else if(class(Input) %in% c("dgCMatrix", "matrix", "data.frame")){
scData = Input[SectionGenes, row.names(MetaData)]
}
if(length(SectionGenes) == 1){
if(ScaleType == "ZScore"){
scData = (scData-mean(scData))/sd(scData) #Zscore
}else if(ScaleType == "MinMax"){
scData = (scData- min(scData)) /(max(scData)-min(scData)) #
}
scData = as.matrix(scData)
colnames(scData) = SectionGenes
}else{
if(ScaleType == "ZScore"){
scData = apply(scData, 1, function(x) (x-mean(x))/sd(x)) #Zscore
}else if(ScaleType == "MinMax"){
scData = apply(scData, 1, function(x) (x- min(x)) /(max(x)-min(x))) #
}}
scData[is.na(scData)] = 0
scData = merge(scData, MetaDataComb, by = 0)
row.names(scData) = scData$Row.names; scData$Row.names = NULL
if(BackgroundCorrect == F){
if(x != 1){ #Scale but no background correction
scData$Combined = NULL}
scDataList[[x]] = scData
}else if(BackgroundCorrect == T){
scData[scData <= 0] <- 0
scData_Pcts = as.data.frame(scData %>% group_by(Combined) %>% summarise_all(function(y){
sum(as.numeric(y) > 0)/n()}))
for(z in SectionGenes){
ClustToReset = subset(scData_Pcts, scData_Pcts[[z]] > 0 & scData_Pcts[[z]] < PercentCutoff)
scData[[z]][scData$Combined %in% ClustToReset$Combined] <- 0
}
if(x != 1){ scData$Combined = NULL}
scDataList[[x]] = scData
}
if(Verbose == T & x %in% UpdateAt){cat(Sign)}
}
#Combine data and format for downstream use
scDataAll = as.data.frame(scDataList, optional=T)
if(length(c(GenesNotIn, SecGenesNotIn))>0){
for(x in c(GenesNotIn, SecGenesNotIn)){
scDataAll[[x]] = 0
}}
OutputList[["GlowwormScaleAllCells"]] = scDataAll
if(Verbose == T){cat("##")}
#Concatenate data for reduced version
scDataReduced = scDataAll %>% group_by(Combined) %>% summarise_all(mean, na.rm = TRUE) %>% as.data.frame()
row.names(scDataReduced) = scDataReduced$Combined
scDataReduced$Combined = NULL
OutputList[["GlowwormScaleOutput"]] = scDataReduced
if(Verbose == T){cat("##")}
#Generate summary statistics
scDataReducedTarget = scDataReduced %>% dplyr::select(GetGenes)
SumStats = as.data.frame(colSums(scDataReducedTarget))
colnames(SumStats) = "Expression"
SumStats$StDev = colSds(as.matrix(scDataReducedTarget))
SumStats[SumStats==Inf] <- 0
SumStats$Expression = (SumStats$Expression- min(SumStats$Expression))/(max(SumStats$Expression) - min(SumStats$Expression))
SumStats$StDev = (SumStats$StDev- min(SumStats$StDev))/(max(SumStats$StDev) - min(SumStats$StDev))
SumStats$RankScore = SumStats$Expression + SumStats$StDev
SumStats = SumStats[order(-SumStats$RankScore),]
if(Verbose == T){cat("##")}
#T-tests
IterTTest = list()
for(iter in 1:Iterations){
i = 0
CompileTRes = as.data.frame(matrix(ncol = 5, nrow=0))
colnames(CompileTRes) = c("Combined", "P.val", "meanDiff")
AllPops = ceiling(length(unique(scDataAll$Combined))/40)
Progress =seq(1,length(unique(scDataAll$Combined)), AllPops)
for(Comb in unique(scDataAll$Combined)){
GetTargets = scDataAll[which(scDataAll$Combined == Comb),]#subset(scDataAll, scDataAll$Combined == Comb)
GetTargets[is.na(GetTargets)] = 0
TargetGenes = GetTargets[, GetGenes]#GetTargets %>% dplyr::select(GetGenes)
SecGenes= GetTargets[, SecGenesList[[paste0("Iteration", iter)]]]#GetTargets %>% dplyr::select(SecGenesList[[paste0("Iteration", iter)]]) ###CompileSecondary
AvgbyCell = rowMeans(TargetGenes)
AvgbyCell_Secondary = rowMeans(SecGenes)
if(sd(AvgbyCell) == sd(AvgbyCell_Secondary)){
TresCompile = as.data.frame(t(c(Comb, mean(AvgbyCell), mean(AvgbyCell_Secondary), 1, 0)))
colnames(TresCompile) = c("Combined", "AvgTarget", "AvgSecondary", "P.val", "meanDiff")
CompileTRes = rbind(CompileTRes, TresCompile)
}else{
Tres = t.test(AvgbyCell, AvgbyCell_Secondary, paired = TRUE, alternative = "two.sided")
TresCompile = as.data.frame(t(c(Comb, mean(AvgbyCell), mean(AvgbyCell_Secondary), as.numeric(Tres[[3]]), as.numeric(Tres[[5]]))))
colnames(TresCompile) = c("Combined", "AvgTarget", "AvgSecondary", "P.val", "meanDiff")
CompileTRes = rbind(CompileTRes, TresCompile)
}
i = i+1
if(i %in% Progress){cat("#")}
}
CompileTRes$P.val = as.numeric(CompileTRes$P.val)
CompileTRes$P.val.adj = p.adjust(CompileTRes$P.val, "BH")
CompileTRes$meanDiff = as.numeric(CompileTRes$meanDiff)
CompileTRes$AvgTarget = as.numeric(CompileTRes$AvgTarget)
CompileTRes$AvgSecondary = as.numeric(CompileTRes$AvgSecondary)
#Calculate frequency of target genes for each population
GeneData_t_Binary = as.data.frame(ifelse(scDataReduced > 0, 1, 0))
GeneData_t_Binary[is.na(GeneData_t_Binary)] = 0
BinaryTarget = GeneData_t_Binary[, GetGenes]
BinarySecondary = GeneData_t_Binary[, SecGenesList[[paste0("Iteration", iter)]]]#GeneData_t_Binary %>% dplyr::select(SecGenesList[[paste0("Iteration", iter)]]) ###CompileSecondary
SumBinary = as.data.frame(rowSums(BinaryTarget))
colnames(SumBinary) = "FrequencyTarget"
SumBinary$FrequencySecondary = rowSums(BinarySecondary)
SumBinary$FrequencyDeltaSub = SumBinary$FrequencyTarget - SumBinary$FrequencySecondary
SumBinary$FrequencyDeltaDiv = SumBinary$FrequencyTarget / SumBinary$FrequencySecondary
SumBinary[is.na(SumBinary)] = 0
SumBinary$FrequencyDeltaDiv = ifelse(is.infinite(SumBinary$FrequencyDeltaDiv) == T, SumBinary$FrequencyTarget, SumBinary$FrequencyDeltaDiv)
#Generate a column with which genes are expressed in each cell population
SumBinary$TargetGenesExpressed=apply(BinaryTarget,1,function(x) paste(names(BinaryTarget)[which(x==1)], collapse="/"))
SumBinary$SecondaryGenesExpressed=apply(BinarySecondary,1,function(x) paste(names(BinarySecondary)[which(x==1)], collapse="/"))
TopDeltas = merge(CompileTRes, SumBinary,by.x = "Combined", by.y = 0)
TopDeltas$FreqTarget_Percent = TopDeltas$FrequencyTarget/length(GetGenes)
TopDeltas$FreqBackground_Percent = TopDeltas$FrequencySecondary/length(SecGenesList[[paste0("Iteration", iter)]]) ###CompileSecondary
TopDeltas$Percent_DeltaSub = TopDeltas$FreqTarget_Percent-TopDeltas$FreqBackground_Percent
TopDeltas = data.frame(TopDeltas[order(-TopDeltas$Percent_DeltaSub),])
TopDeltas$Significant = ifelse(TopDeltas$P.val.adj < 0.05 & TopDeltas$FrequencyTarget > 1 & TopDeltas$Percent_DeltaSub > 0 & TopDeltas$meanDiff > 0, "Enriched", "")
TopDeltas$Subclass = gsub("\\|.*", "", TopDeltas$Combined)
TopDeltas$Class = gsub(".*\\|", "", TopDeltas$Combined)
nSig = subset(TopDeltas, TopDeltas$Significant == "Enriched")
IterTTest[[iter]] = TopDeltas
if(iter == 1){
TopDeltasShort = TopDeltas %>% dplyr::select(c("Combined", "Percent_DeltaSub" , "meanDiff", "Significant", "FrequencyTarget"))
TopDeltasShort$Iteration = "Iter1"
}else{
TopDeltasShortV2 = TopDeltas %>% dplyr::select(c("Combined", "Percent_DeltaSub" , "meanDiff", "Significant", "FrequencyTarget"))
TopDeltasShortV2$Iteration = paste("Iter", iter)
TopDeltasShort = rbind(TopDeltasShort, TopDeltasShortV2)
}
}
TopDeltasShort$Binary = ifelse(TopDeltasShort$Significant == "Enriched", 1, 0)
TopDeltasEnriched = TopDeltasShort %>% group_by(Combined) %>% dplyr::summarize("nEnriched" = sum(Binary), "avgMeanDiff" = mean(meanDiff), "avgPercent" = mean(Percent_DeltaSub), "avgFreq" = mean(FrequencyTarget))
TopDeltasEnriched = TopDeltasEnriched[order(-TopDeltasEnriched$avgPercent),]
TopDeltasEnriched$Significant = ifelse(TopDeltasEnriched$nEnriched == Iterations, "Enriched", "")
setClass("GlowwormObj", representation(MetaData = "data.frame", GlowwormScaleOutput = "data.frame", RankScores = "data.frame", SumStats = "data.frame", Settings = "list"))
Output = new("GlowwormObj",
MetaData = OutputList[["MetaData"]],
GlowwormScaleOutput = OutputList[["GlowwormScaleOutput"]],
RankScores = TopDeltasEnriched,
SumStats = SumStats,
Settings = Settings)
if(Verbose == T){cat(paste("###### | \nGlowwormScale complete for ", Settings$InputFile, "\nFrom ", length(GeneList), " input genes, ", length(GetGenes), " genes were processed. ", length(GenesNotIn), " genes were not in input data.\n", BackgroundType, " background genes were used.\n", sep=""))}
##
setMethod("show",signature = signature(object = "GlowwormObj"),
function(object){
print_message <- paste(c("\nGlowwormScale for ", Settings$InputFile, "\nFrom ", length(GeneList), " input genes, ", length(GetGenes), " genes were processed. ", length(GenesNotIn), " genes were not in input data.\n", BackgroundType, " background genes were used.\n", dim(nSig)[1], "populations are significantly enriched.", sep=""))
cat(print_message,sep = "")
})
return(Output)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.